{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from __future__ import print_function\n",
    "from __future__ import division\n",
    "\n",
    "from tqdm import *\n",
    "import csv\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import scipy\n",
    "import nltk\n",
    "import sklearn\n",
    "import random\n",
    "import re\n",
    "from sklearn.metrics import f1_score, precision_score, recall_score\n",
    "from sklearn.linear_model import LogisticRegression, LinearRegression\n",
    "from sklearn.naive_bayes import GaussianNB\n",
    "from sklearn.decomposition import PCA, RandomizedPCA\n",
    "from sklearn import svm\n",
    "from sklearn.grid_search import GridSearchCV, ParameterGrid\n",
    "from sklearn.pipeline import Pipeline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "path_to_csv_data = \"/home/felipe/auto-tagger/data/RawRCV1/csv/reuters-rcv1-full.csv\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "ename": "KeyboardInterrupt",
     "evalue": "",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mKeyboardInterrupt\u001b[0m                         Traceback (most recent call last)",
      "\u001b[1;32m<ipython-input-5-4018a7caf08d>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[0;32m     38\u001b[0m     \u001b[0mseries\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mSeries\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mout_tags\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     39\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 40\u001b[1;33m     \u001b[0mpartial_Y_df\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mseries\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mstr\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget_dummies\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0msep\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m','\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mastype\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0muint8\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m     41\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     42\u001b[0m     \u001b[1;32mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mpartial_Y_df\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0minfo\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32m/home/felipe/venv2-global/local/lib/python2.7/site-packages/pandas/core/strings.pyc\u001b[0m in \u001b[0;36mget_dummies\u001b[1;34m(self, sep)\u001b[0m\n\u001b[0;32m   1624\u001b[0m         \u001b[0mresult\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mname\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mstr_get_dummies\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0msep\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   1625\u001b[0m         return self._wrap_result(result, use_codes=(not self._is_categorical),\n\u001b[1;32m-> 1626\u001b[1;33m                                  name=name, expand=True)\n\u001b[0m\u001b[0;32m   1627\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   1628\u001b[0m     \u001b[1;33m@\u001b[0m\u001b[0mcopy\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mstr_translate\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32m/home/felipe/venv2-global/local/lib/python2.7/site-packages/pandas/core/strings.pyc\u001b[0m in \u001b[0;36m_wrap_result\u001b[1;34m(self, result, use_codes, name, expand)\u001b[0m\n\u001b[0;32m   1391\u001b[0m             \u001b[1;32mif\u001b[0m \u001b[0mexpand\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   1392\u001b[0m                 \u001b[0mcons\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_orig\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_constructor_expanddim\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1393\u001b[1;33m                 \u001b[1;32mreturn\u001b[0m \u001b[0mcons\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mresult\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcolumns\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mname\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mindex\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mindex\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m   1394\u001b[0m             \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   1395\u001b[0m                 \u001b[1;31m# Must be a Series\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32m/home/felipe/venv2-global/local/lib/python2.7/site-packages/pandas/core/frame.pyc\u001b[0m in \u001b[0;36m__init__\u001b[1;34m(self, data, index, columns, dtype, copy)\u001b[0m\n\u001b[0;32m    303\u001b[0m                     \u001b[1;32mif\u001b[0m \u001b[0mis_named_tuple\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mand\u001b[0m \u001b[0mcolumns\u001b[0m \u001b[1;32mis\u001b[0m \u001b[0mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    304\u001b[0m                         \u001b[0mcolumns\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mdata\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_fields\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 305\u001b[1;33m                     \u001b[0marrays\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcolumns\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0m_to_arrays\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcolumns\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    306\u001b[0m                     \u001b[0mcolumns\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0m_ensure_index\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcolumns\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    307\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32m/home/felipe/venv2-global/local/lib/python2.7/site-packages/pandas/core/frame.pyc\u001b[0m in \u001b[0;36m_to_arrays\u001b[1;34m(data, columns, coerce_float, dtype)\u001b[0m\n\u001b[0;32m   5550\u001b[0m         \u001b[0mdata\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mlmap\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtuple\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdata\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   5551\u001b[0m         return _list_to_arrays(data, columns, coerce_float=coerce_float,\n\u001b[1;32m-> 5552\u001b[1;33m                                dtype=dtype)\n\u001b[0m\u001b[0;32m   5553\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   5554\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32m/home/felipe/venv2-global/local/lib/python2.7/site-packages/pandas/core/frame.pyc\u001b[0m in \u001b[0;36m_list_to_arrays\u001b[1;34m(data, columns, coerce_float, dtype)\u001b[0m\n\u001b[0;32m   5607\u001b[0m         \u001b[0mcontent\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mlist\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mlib\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mto_object_array\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mT\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   5608\u001b[0m     return _convert_object_array(content, columns, dtype=dtype,\n\u001b[1;32m-> 5609\u001b[1;33m                                  coerce_float=coerce_float)\n\u001b[0m\u001b[0;32m   5610\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   5611\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32m/home/felipe/venv2-global/local/lib/python2.7/site-packages/pandas/core/frame.pyc\u001b[0m in \u001b[0;36m_convert_object_array\u001b[1;34m(content, columns, coerce_float, dtype)\u001b[0m\n\u001b[0;32m   5675\u001b[0m         \u001b[1;32mreturn\u001b[0m \u001b[0marr\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   5676\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 5677\u001b[1;33m     \u001b[0marrays\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m[\u001b[0m\u001b[0mconvert\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0marr\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0marr\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mcontent\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m   5678\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   5679\u001b[0m     \u001b[1;32mreturn\u001b[0m \u001b[0marrays\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcolumns\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32m/home/felipe/venv2-global/local/lib/python2.7/site-packages/pandas/core/frame.pyc\u001b[0m in \u001b[0;36mconvert\u001b[1;34m(arr)\u001b[0m\n\u001b[0;32m   5671\u001b[0m     \u001b[1;32mdef\u001b[0m \u001b[0mconvert\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0marr\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   5672\u001b[0m         \u001b[1;32mif\u001b[0m \u001b[0mdtype\u001b[0m \u001b[1;33m!=\u001b[0m \u001b[0mobject\u001b[0m \u001b[1;32mand\u001b[0m \u001b[0mdtype\u001b[0m \u001b[1;33m!=\u001b[0m \u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mobject\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 5673\u001b[1;33m             \u001b[0marr\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mlib\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mmaybe_convert_objects\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0marr\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtry_float\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mcoerce_float\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m   5674\u001b[0m             \u001b[0marr\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0m_possibly_cast_to_datetime\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0marr\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   5675\u001b[0m         \u001b[1;32mreturn\u001b[0m \u001b[0marr\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;31mKeyboardInterrupt\u001b[0m: "
     ]
    }
   ],
   "source": [
    "# documents = []\n",
    "# tags = []\n",
    "\n",
    "chunks = [\n",
    "    [0,100000],\n",
    "    [100001,200000],\n",
    "    [200001,300000],\n",
    "    [300001,400000],\n",
    "    [400001,500000],\n",
    "    [500001,600000],\n",
    "    [600001,700000],\n",
    "    [700001,800000],\n",
    "    [800001,900000]\n",
    "]\n",
    "    \n",
    "# http://stackoverflow.com/a/654046/436721\n",
    "def read_and_save(begin,end,tags):\n",
    "    with open(path_to_csv_data) as file:\n",
    "        reader = csv.reader(file,escapechar='\\\\')      \n",
    "              \n",
    "        for (i,line) in enumerate(reader):\n",
    "            \n",
    "            if(i < begin):\n",
    "                continue\n",
    "                \n",
    "            if(i >= end):\n",
    "                return tags\n",
    "            \n",
    "            (id,title,body,labels) = line\n",
    "            \n",
    "            text = title+\" \"+body\n",
    "                       \n",
    "#             documents.append(text)\n",
    "            tags.append(labels)\n",
    "\n",
    "for fromindex,toindex in chunks:\n",
    "    out_tags = read_and_save(fromindex,toindex,[])\n",
    "    series = pd.Series(out_tags)\n",
    "    \n",
    "    partial_Y_df = series.str.get_dummies(sep=',').astype(np.uint8)\n",
    "    \n",
    "    print(partial_Y_df.info())   "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "(len(documents),len(tags))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "tags[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def preprocessor(string):\n",
    "    repl = re.sub('&lt;','',string)\n",
    "    repl = re.sub('<[^>]+>','',string)\n",
    "    return repl.lower()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "series = pd.Series(tags)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# split = series.str.split(',');split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "Y_df = series.str.get_dummies(sep=',').astype(np.uint8)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "Y_df.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "Y_df =pd.get_dummies(series.str.split(',')).astype(np.uint8)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "Y_df.head(1).iloc[0].values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
